import pandas as pd
from pdf2docx import Converter
import re

# Input PDF (a fund annual report) and the .docx path the conversion writes to.
pdf_file = r"C:\Users\kf40\Desktop\智能提取信息\1617148857055-dongfanghongwenjianjingxuanhunhexingzhengquantouzijijin2020nianniandubaogao.pdf"
docx_file = r'C:\Users\kf40\Desktop\智能提取信息\sample02.docx'

# Convert the PDF to docx.
cv = Converter(pdf_file)
try:
    # start=0 / end=None: per the pdf2docx docs this covers the first page
    # through the last page.
    cv.convert(docx_file, start=0, end=None)
finally:
    # Always release the converter, even when the conversion raises.
    cv.close()

# Example: reading the text content of a docx file
import docx
# Open the Word document that the rest of the script inspects.
# NOTE(review): this opens sample01.docx, while the conversion step writes
# sample02.docx -- confirm which file is actually intended here.
file = docx.Document(r"C:\Users\kf40\Desktop\智能提取信息\sample01.docx")
# Report how many paragraphs were found (the original author noted 13 for
# their sample; each hard return starts a new paragraph).
print(f"段落数:{len(file.paragraphs)}")

# Dump the text of every paragraph, one per line.
for para in file.paragraphs:
    print(para.text)
#
# # Print each paragraph's index together with its content
# for i in range(len(file.paragraphs)):
#     print("第"+str(i)+"段的内容是："+file.paragraphs[i].text)

# Scan every table in the document and collect the rows whose first cell is
# one of the labels in key_list, mapping label -> text of the second cell.
tables = file.tables
keyword_dict = {}
# NOTE(review): '业绩比较标准' may have been meant as '业绩比较基准' -- confirm
# against the labels that actually appear in the report's tables.
key_list = ['基金名称','基金简称','场内简称','基金主代码','基金管理人','基金托管人','基金合同生效日','基金运作方式','投资目标','业绩比较标准','风险收益特征','产品沿革']
for tb in tables:
    for tb_row in tb.rows:
        # Cell texts of this row with ASCII spaces stripped out.
        row_data = [cell.text.replace(" ", "") for cell in tb_row.cells]
        # A label/value pair needs at least two cells; shorter rows are
        # skipped instead of raising IndexError on row_data[1].
        if len(row_data) > 1 and row_data[0] in key_list:
            keyword_dict[row_data[0]] = row_data[1]
        print(row_data)
# NOTE(review): `display` is not defined anywhere in the visible source;
# presumably this runs under IPython/Jupyter, which provides it -- confirm.
display(keyword_dict)
# One row per collected label, with the extracted text in 'Description'.
df_keyword = pd.DataFrame(pd.Series(keyword_dict), columns=['Description'])
# Relabel the '基金主代码' row as '基金代码'.
df_keyword = df_keyword.rename(index={'基金主代码':'基金代码'})
display(df_keyword)
# Known fund categories (kept for reference; not consulted below).
Fund_list = [
    "股票型投资基金",
    "混合型证券投资基金",
    "混合型基金中基金",
    "债券型证券投资基金",
    "货币市场基金",
]

# Derive the fund category from substrings of the fund's short name.
# Raises KeyError when no '基金简称' row was collected from the tables.
temp = keyword_dict['基金简称']
if '发起式' in temp:
    # Names containing '发起式' are split only on whether '混合型' appears.
    fund_category = "混合型证券投资基金" if '混合型' in temp else "股票型投资基金"
elif 'FOF' in temp:
    fund_category = "混合型基金中基金"
elif '债券型' in temp:
    fund_category = "债券型证券投资基金"
elif "货币市场" in temp:
    fund_category = "货币市场基金"
else:
    # Fallback when none of the markers above is present.
    fund_category = "混合型证券投资基金"

# The same category value is written to every row of the new column.
df_keyword['基金类别'] = fund_category

# Search each paragraph separately for text starting at '本基金的投资范围'
# and print the matches found in that paragraph.
# NOTE(review): `a` is printed below but never populated; presumably it was
# meant to accumulate the per-paragraph matches -- confirm before relying on it.
a = []
for para in file.paragraphs:
    word = re.findall(r'本基金的投资范围.*', para.text)
    # The original `if` body was over-indented, which made the whole file
    # fail with IndentationError; it belongs at the loop-body level.
    if word:
        print(word)
print(a)

# Search again over all paragraph texts concatenated without a separator, so
# a match can span paragraph boundaries: from '本基金的投资范围' up to the
# next occurrence of '投资范围' (non-greedy).
x = file.paragraphs
text = "".join(p.text for p in x)
word = re.findall(r'本基金的投资范围.*?投资范围', text)
print(word)